In [1]:
import pandas as pd
In [2]:
Superstore_sales_data = pd.read_csv("C://Users//Dell//OneDrive//Documents//SampleSuperstore.csv")
In [3]:
# Display the first few rows of the dataset
Superstore_sales_data.head()
Out[3]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
0 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Bookcases 261.9600 2 0.00 41.9136
1 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Chairs 731.9400 3 0.00 219.5820
2 Second Class Corporate United States Los Angeles California 90036 West Office Supplies Labels 14.6200 2 0.00 6.8714
3 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Furniture Tables 957.5775 5 0.45 -383.0310
4 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Office Supplies Storage 22.3680 2 0.20 2.5164
In [4]:
# Calculate the middle index
middle_index = len(Superstore_sales_data) // 2

# Extract middle rows
middle_rows = Superstore_sales_data.iloc[middle_index-5:middle_index+5]  # Extract 10 rows around the middle

# Display the middle rows
middle_rows
Out[4]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
4992 Standard Class Corporate United States Memphis Tennessee 38109 South Office Supplies Envelopes 55.936 8 0.2 18.8784
4993 Standard Class Corporate United States Memphis Tennessee 38109 South Office Supplies Labels 18.432 8 0.2 5.9904
4994 Standard Class Corporate United States Memphis Tennessee 38109 South Furniture Furnishings 20.320 5 0.2 3.5560
4995 Standard Class Home Office United States New York City New York 10011 East Office Supplies Binders 52.064 4 0.2 18.8732
4996 Standard Class Corporate United States Raleigh North Carolina 27604 South Office Supplies Appliances 48.784 1 0.2 3.6588
4997 Standard Class Corporate United States Raleigh North Carolina 27604 South Office Supplies Binders 13.092 4 0.7 -10.0372
4998 Standard Class Home Office United States Los Angeles California 90045 West Technology Phones 109.592 1 0.2 8.2194
4999 Standard Class Home Office United States Los Angeles California 90045 West Office Supplies Paper 56.700 5 0.0 27.7830
5000 Standard Class Home Office United States Los Angeles California 90004 West Technology Accessories 79.990 1 0.0 28.7964
5001 Same Day Home Office United States Newport News Virginia 23602 South Office Supplies Supplies 69.500 5 0.0 20.1550
In [5]:
# Display the last few rows of the dataset
Superstore_sales_data.tail()
Out[5]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit
9989 Second Class Consumer United States Miami Florida 33180 South Furniture Furnishings 25.248 3 0.2 4.1028
9990 Standard Class Consumer United States Costa Mesa California 92627 West Furniture Furnishings 91.960 2 0.0 15.6332
9991 Standard Class Consumer United States Costa Mesa California 92627 West Technology Phones 258.576 2 0.2 19.3932
9992 Standard Class Consumer United States Costa Mesa California 92627 West Office Supplies Paper 29.600 4 0.0 13.3200
9993 Second Class Consumer United States Westminster California 92683 West Office Supplies Appliances 243.160 2 0.0 72.9480
In [6]:
# Check the dimensions of the dataset (rows, columns)
print("Dimensions of the dataset:", Superstore_sales_data.shape)
Dimensions of the dataset: (9994, 13)
In [7]:
#Get an overview of the columns and their data types
Superstore_sales_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ship Mode     9994 non-null   object 
 1   Segment       9994 non-null   object 
 2   Country       9994 non-null   object 
 3   City          9994 non-null   object 
 4   State         9994 non-null   object 
 5   Postal Code   9994 non-null   int64  
 6   Region        9994 non-null   object 
 7   Category      9994 non-null   object 
 8   Sub-Category  9994 non-null   object 
 9   Sales         9994 non-null   float64
 10  Quantity      9994 non-null   int64  
 11  Discount      9994 non-null   float64
 12  Profit        9994 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1015.1+ KB
In [8]:
# Display basic statistics for numerical columns
Superstore_sales_data.describe()
Out[8]:
Postal Code Sales Quantity Discount Profit
count 9994.000000 9994.000000 9994.000000 9994.000000 9994.000000
mean 55190.379428 229.858001 3.789574 0.156203 28.656896
std 32063.693350 623.245101 2.225110 0.206452 234.260108
min 1040.000000 0.444000 1.000000 0.000000 -6599.978000
25% 23223.000000 17.280000 2.000000 0.000000 1.728750
50% 56430.500000 54.490000 3.000000 0.200000 8.666500
75% 90008.000000 209.940000 5.000000 0.200000 29.364000
max 99301.000000 22638.480000 14.000000 0.800000 8399.976000
In [9]:
# Check for missing values in each column
print("Missing values per column:")
print(Superstore_sales_data.isnull().any())
Missing values per column:
Ship Mode       False
Segment         False
Country         False
City            False
State           False
Postal Code     False
Region          False
Category        False
Sub-Category    False
Sales           False
Quantity        False
Discount        False
Profit          False
dtype: bool
In [10]:
# Check data types of each column
print("Data types of each column:")
print(Superstore_sales_data.dtypes)
Data types of each column:
Ship Mode        object
Segment          object
Country          object
City             object
State            object
Postal Code       int64
Region           object
Category         object
Sub-Category     object
Sales           float64
Quantity          int64
Discount        float64
Profit          float64
dtype: object
In [11]:
# Check for duplicate rows
print("Number of duplicate rows:", Superstore_sales_data.duplicated().sum())
Number of duplicate rows: 17
In [12]:
# Remove duplicate rows
Superstore_sales_data = Superstore_sales_data.drop_duplicates()
In [13]:
# Check the dimensions of the dataset (rows, columns)
print("Dimensions of the dataset:", Superstore_sales_data.shape)
Dimensions of the dataset: (9977, 13)
In [14]:
# Calculate total sales
total_sales = Superstore_sales_data['Sales'].sum()
print("Total Sales:", total_sales)
Total Sales: 2296195.5903
In [15]:
# Calculate average order value
average_order_value = Superstore_sales_data['Sales'].mean()

print("Average Order Value:", average_order_value)
Average Order Value: 230.14890150345792
In [16]:
# Calculate total quantity sold
total_quantity_sold = Superstore_sales_data['Quantity'].sum()

print("Total Quantity Sold:", total_quantity_sold)
Total Quantity Sold: 37820
In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
In [18]:
# Calculate basic descriptive statistics for key metrics
total_sales = Superstore_sales_data['Sales'].sum()
average_order_value = Superstore_sales_data['Sales'].mean()
max_sales = Superstore_sales_data['Sales'].max()
min_sales = Superstore_sales_data['Sales'].min()

print("Total Sales:", total_sales)
print("Average Order Value:", average_order_value)
print("Maximum Sales:", max_sales)
print("Minimum Sales:", min_sales)
Total Sales: 2296195.5903
Average Order Value: 230.14890150345792
Maximum Sales: 22638.48
Minimum Sales: 0.444
In [19]:
# Visualize the distribution of sales
plt.figure(figsize=(10, 6))
sns.histplot(Superstore_sales_data['Sales'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Sales')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.show()
In [20]:
# Visualize the distribution of order quantity
plt.figure(figsize=(10, 6))
sns.histplot(Superstore_sales_data['Quantity'], bins=30, kde=True, color='orange')
plt.title('Distribution of Order Quantity')
plt.xlabel('Quantity')
plt.ylabel('Frequency')
plt.show()
In [21]:
# Visualize the distribution of profit
plt.figure(figsize=(10, 6))
sns.histplot(Superstore_sales_data['Profit'], bins=30, kde=True, color='green')
plt.title('Distribution of Profit')
plt.xlabel('Profit')
plt.ylabel('Frequency')
plt.show()
In [22]:
# Visualize sales by region
plt.figure(figsize=(10, 6))
sns.barplot(x='Region', y='Sales', data=Superstore_sales_data, estimator=sum, ci=None)
plt.title('Total Sales by Region')
plt.xlabel('Region')
plt.ylabel('Total Sales')
plt.show()
C:\Users\Dell\AppData\Local\Temp\ipykernel_6912\2317922662.py:3: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(x='Region', y='Sales', data=Superstore_sales_data, estimator=sum, ci=None)
In [23]:
# Create a unique identifier for each customer
Superstore_sales_data['Customer ID'] = Superstore_sales_data['Country'] + '_' + Superstore_sales_data['City'] + '_' + Superstore_sales_data['State']

# Check the updated dataframe
Superstore_sales_data.head()
Out[23]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit Customer ID
0 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Bookcases 261.9600 2 0.00 41.9136 United States_Henderson_Kentucky
1 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Chairs 731.9400 3 0.00 219.5820 United States_Henderson_Kentucky
2 Second Class Corporate United States Los Angeles California 90036 West Office Supplies Labels 14.6200 2 0.00 6.8714 United States_Los Angeles_California
3 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Furniture Tables 957.5775 5 0.45 -383.0310 United States_Fort Lauderdale_Florida
4 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Office Supplies Storage 22.3680 2 0.20 2.5164 United States_Fort Lauderdale_Florida
In [24]:
# Let's create a sample 'Order Date' column using synthetic date data
import numpy as np

# Generate synthetic date data for demonstration
np.random.seed(0)
Superstore_sales_data['Order Date'] = pd.to_datetime(np.random.choice(pd.date_range('2019-01-01', '2022-01-01'), len(Superstore_sales_data)))

# Check the updated dataframe
Superstore_sales_data.head()

# Now you can proceed with the product analysis using the 'Order Date' column
Out[24]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit Customer ID Order Date
0 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Bookcases 261.9600 2 0.00 41.9136 United States_Henderson_Kentucky 2020-11-15
1 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Chairs 731.9400 3 0.00 219.5820 United States_Henderson_Kentucky 2020-07-13
2 Second Class Corporate United States Los Angeles California 90036 West Office Supplies Labels 14.6200 2 0.00 6.8714 United States_Los Angeles_California 2021-04-15
3 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Furniture Tables 957.5775 5 0.45 -383.0310 United States_Fort Lauderdale_Florida 2021-02-02
4 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Office Supplies Storage 22.3680 2 0.20 2.5164 United States_Fort Lauderdale_Florida 2021-10-30
In [25]:
# create the Order ID column
# Generate unique order IDs
Superstore_sales_data['Order ID'] = Superstore_sales_data.groupby(Superstore_sales_data['Order Date'].dt.date).ngroup() + 1

# Display the updated DataFrame with Order ID column
Superstore_sales_data.head()
Out[25]:
Ship Mode Segment Country City State Postal Code Region Category Sub-Category Sales Quantity Discount Profit Customer ID Order Date Order ID
0 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Bookcases 261.9600 2 0.00 41.9136 United States_Henderson_Kentucky 2020-11-15 685
1 Second Class Consumer United States Henderson Kentucky 42420 South Furniture Chairs 731.9400 3 0.00 219.5820 United States_Henderson_Kentucky 2020-07-13 560
2 Second Class Corporate United States Los Angeles California 90036 West Office Supplies Labels 14.6200 2 0.00 6.8714 United States_Los Angeles_California 2021-04-15 836
3 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Furniture Tables 957.5775 5 0.45 -383.0310 United States_Fort Lauderdale_Florida 2021-02-02 764
4 Standard Class Consumer United States Fort Lauderdale Florida 33311 South Office Supplies Storage 22.3680 2 0.20 2.5164 United States_Fort Lauderdale_Florida 2021-10-30 1034
In [26]:
# Calculate RFM metrics
# Example: Calculate Recency, Frequency, and Monetary metrics
# Recency: Calculate the number of days since the last purchase for each customer
recency_Superstore_sales_data = Superstore_sales_data.groupby('Customer ID')['Order Date'].max().reset_index()
recency_Superstore_sales_data['Recency'] = (pd.to_datetime('today') - recency_Superstore_sales_data['Order Date']).dt.days

# Frequency: Calculate the number of orders for each customer
frequency_Superstore_sales_data = Superstore_sales_data.groupby('Customer ID')['Order ID'].count().reset_index()
frequency_Superstore_sales_data.columns = ['Customer ID', 'Frequency']

# Monetary: Calculate the total amount spent by each customer
monetary_Superstore_sales_data = Superstore_sales_data.groupby('Customer ID')['Sales'].sum().reset_index()
monetary_Superstore_sales_data.columns = ['Customer ID', 'Monetary']

# Merge the RFM metrics
rfm_Superstore_sales_data = recency_Superstore_sales_data.merge(frequency_Superstore_sales_data, on='Customer ID').merge(monetary_Superstore_sales_data, on='Customer ID')


# Display the RFM DataFrame
rfm_Superstore_sales_data.head()
Out[26]:
Customer ID Order Date Recency Frequency Monetary
0 United States_Aberdeen_South Dakota 2021-03-17 1142 1 25.500
1 United States_Abilene_Texas 2020-06-30 1402 1 1.392
2 United States_Akron_Ohio 2021-08-05 1001 21 2729.986
3 United States_Albuquerque_New Mexico 2021-11-15 899 14 2220.160
4 United States_Alexandria_Virginia 2021-12-25 859 16 5519.570
In [27]:
# After calculating the RFM metrics, you can proceed with customer segmentation based on these metrics. 
# Here's an example of how you can perform customer segmentation using RFM scores
# Calculate RFM scores
rfm_Superstore_sales_data['RecencyScore'] = pd.qcut(rfm_Superstore_sales_data['Recency'], q=4, labels=False)
rfm_Superstore_sales_data['FrequencyScore'] = pd.qcut(rfm_Superstore_sales_data['Frequency'], q=4, labels=False)
rfm_Superstore_sales_data['MonetaryScore'] = pd.qcut(rfm_Superstore_sales_data['Monetary'], q=4, labels=False)

# Calculate RFM total score
rfm_Superstore_sales_data['RFM_Score'] = rfm_Superstore_sales_data['RecencyScore'] + rfm_Superstore_sales_data['FrequencyScore'] + rfm_Superstore_sales_data['MonetaryScore']

# Define segmentation labels
segment_labels = ['Low Value', 'Mid Value', 'High Value']

# Assign segment labels based on RFM score
rfm_Superstore_sales_data['Segment'] = pd.cut(rfm_Superstore_sales_data['RFM_Score'], bins=3, labels=segment_labels)

# Display the segmented DataFrame
rfm_Superstore_sales_data.head()
Out[27]:
Customer ID Order Date Recency Frequency Monetary RecencyScore FrequencyScore MonetaryScore RFM_Score Segment
0 United States_Aberdeen_South Dakota 2021-03-17 1142 1 25.500 2 0 0 2 Low Value
1 United States_Abilene_Texas 2020-06-30 1402 1 1.392 3 0 0 3 Mid Value
2 United States_Akron_Ohio 2021-08-05 1001 21 2729.986 2 3 3 8 High Value
3 United States_Albuquerque_New Mexico 2021-11-15 899 14 2220.160 1 3 2 6 High Value
4 United States_Alexandria_Virginia 2021-12-25 859 16 5519.570 0 3 3 6 High Value
In [28]:
from sklearn.cluster import KMeans
In [29]:
# Select relevant features for customer segmentation
data = Superstore_sales_data[['Sales', 'Quantity', 'Profit']]
In [30]:
#Customer Segmentation using K-Means Clustering
# Define the number of clusters (segments)
num_clusters = 3

# Initialize the KMeans model
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit the model to the data
kmeans.fit(data)

# Add the cluster labels to the original DataFrame
Superstore_sales_data['Cluster'] = kmeans.labels_

# Display the count of customers in each cluster
print(Superstore_sales_data['Cluster'].value_counts())
C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
0    9494
1     468
2      15
Name: Cluster, dtype: int64
In [31]:
#Analyze Customer Segments
# Analyze the characteristics of each customer segment
segment_analysis = Superstore_sales_data.groupby('Cluster').agg({
    'Sales': ['mean', 'sum'],
    'Quantity': ['mean', 'sum'],
    'Profit': ['mean', 'sum']
}).reset_index()

segment_analysis
Out[31]:
Cluster Sales Quantity Profit
mean sum mean sum mean sum
0 0 136.037294 1.291538e+06 3.684432 34980 13.820554 131212.3360
1 1 1806.675550 8.455242e+05 5.893162 2758 248.616325 116352.4399
2 2 10608.891067 1.591334e+05 5.466667 82 2578.443113 38676.6467
In [32]:
#  Visualize Customer Segments
# Visualize customer segments based on Sales, Quantity, and Profit
plt.figure(figsize=(12, 8))
sns.scatterplot(data=Superstore_sales_data, x='Sales', y='Quantity', hue='Cluster', palette='viridis', size='Profit', sizes=(20, 200))
plt.title('Customer Segmentation')
plt.xlabel('Sales')
plt.ylabel('Quantity')
plt.legend(title='Cluster')
plt.show()
In [33]:
import matplotlib.pyplot as plt
import seaborn as sns

# Segment customers based on purchasing behavior
# For example, let's segment customers into high-value and frequent customers
# You can define your segmentation criteria based on your business requirements

# Calculate total sales and frequency of orders for each customer
customer_data = Superstore_sales_data.groupby('Customer ID').agg({'Sales': 'sum', 'Order Date': 'count'}).reset_index()
customer_data.columns = ['Customer ID', 'Total Sales', 'Order Count']

# Determine threshold values for high-value and frequent customers
high_value_threshold = customer_data['Total Sales'].quantile(0.8)
frequent_customer_threshold = customer_data['Order Count'].quantile(0.8)

# Segment customers based on thresholds
customer_data['Customer Segment'] = 'Regular'
customer_data.loc[(customer_data['Total Sales'] >= high_value_threshold) & (customer_data['Order Count'] >= frequent_customer_threshold), 'Customer Segment'] = 'High-Value, Frequent'

# Visualize customer segmentation
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Total Sales', y='Order Count', hue='Customer Segment', data=customer_data)
plt.title('Customer Segmentation based on Purchasing Behavior')
plt.xlabel('Total Sales')
plt.ylabel('Order Count')
plt.legend(title='Customer Segment')
plt.grid(True)
plt.show()
In [34]:
# Top-Selling Products and Categories
# Identify top-selling products
top_products = Superstore_sales_data.groupby('Sub-Category')['Quantity'].sum().sort_values(ascending=False).head(10)
print("Top Selling Products:")
print(top_products)

# Identify top-selling categories
top_categories = Superstore_sales_data.groupby('Category')['Quantity'].sum().sort_values(ascending=False)
print("\nTop Selling Categories:")
print(top_categories)
Top Selling Products:
Sub-Category
Binders        5971
Paper          5144
Furnishings    3560
Phones         3289
Storage        3158
Art            2996
Accessories    2976
Chairs         2351
Appliances     1729
Labels         1396
Name: Quantity, dtype: int64

Top Selling Categories:
Category
Office Supplies    22861
Furniture           8020
Technology          6939
Name: Quantity, dtype: int64
In [35]:
# Performance of Products Over Time
# Convert 'Order Date' to datetime if not already in datetime format
Superstore_sales_data['Order Date'] = pd.to_datetime(Superstore_sales_data['Order Date'])

# Extract year and month from 'Order Date'
Superstore_sales_data['Year'] = Superstore_sales_data['Order Date'].dt.year
Superstore_sales_data['Month'] =  Superstore_sales_data['Order Date'].dt.month
Superstore_sales_data['Month'] =  Superstore_sales_data['Order Date'].dt.month

# Group by year and month to analyze performance over time
product_performance =  Superstore_sales_data.groupby(['Year', 'Month', 'Sub-Category'])['Quantity'].sum().reset_index()

# Plot performance of top-selling product over time
top_product_name = top_products.index[0]
top_product_performance = product_performance[product_performance['Sub-Category'] == top_product_name]

plt.figure(figsize=(12, 6))
sns.lineplot(data=top_product_performance, x='Month', y='Quantity', hue='Year', marker='o')
plt.title(f"Performance of {top_product_name} Over Time")
plt.xlabel("Month")
plt.ylabel("Quantity Sold")
plt.legend(title='Year')
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()
In [36]:
# Prepare the Data
# Convert 'Order Date' to datetime if not already in datetime format
Superstore_sales_data['Order Date'] = pd.to_datetime(Superstore_sales_data['Order Date'])

# Extract date components for further analysis (e.g., day of week, month, year)Superstore_sales_data
Superstore_sales_data['Day_of_Week'] = Superstore_sales_data['Order Date'].dt.dayofweek
Superstore_sales_data['Month'] = Superstore_sales_data['Order Date'].dt.month
Superstore_sales_data['Year'] = Superstore_sales_data['Order Date'].dt.year
In [37]:
# Daily Sales Trend
# Calculate daily sales
daily_sales = Superstore_sales_data.groupby('Order Date')['Sales'].sum()

# Plot daily sales trend
plt.figure(figsize=(12, 6))
daily_sales.plot()
plt.title('Daily Sales Trend')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
In [38]:
# Monthly Sales Trend
# Calculate monthly sales
monthly_sales = Superstore_sales_data.groupby(pd.Grouper(key='Order Date', freq='M'))['Sales'].sum()

# Plot monthly sales trend
plt.figure(figsize=(12, 6))
monthly_sales.plot(marker='o')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
In [39]:
# Yearly Sales Trend
# Calculate yearly sales
yearly_sales = Superstore_sales_data.groupby(pd.Grouper(key='Order Date', freq='Y'))['Sales'].sum()

# Plot yearly sales trend
plt.figure(figsize=(12, 6))
yearly_sales.plot(kind='bar')
plt.title('Yearly Sales Trend')
plt.xlabel('Year')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
In [40]:
# Seasonal Decomposition

from statsmodels.tsa.seasonal import seasonal_decompose

# Perform seasonal decomposition
result = seasonal_decompose(daily_sales, model='multiplicative', period=30)  # Assuming a seasonal period of 30 days

# Plot decomposition
plt.figure(figsize=(12, 10))

plt.subplot(4, 1, 1)
result.observed.plot(title='Observed')

plt.subplot(4, 1, 2)
result.trend.plot(title='Trend')

plt.subplot(4, 1, 3)
result.seasonal.plot(title='Seasonal')

plt.subplot(4, 1, 4)
result.resid.plot(title='Residuals')

plt.tight_layout()
plt.show()
In [41]:
# Analyze Customer Behavior
# Example: Visualize customer segments based on sales and profit
plt.figure(figsize=(10, 6))
sns.scatterplot(data=Superstore_sales_data, x='Sales', y='Profit', hue='Segment')
plt.title('Customer Segments based on Sales and Profit')
plt.xlabel('Sales')
plt.ylabel('Profit')
plt.legend(title='Segment')
plt.grid(True)
plt.show()
In [42]:
# Identify Popular Products and Categories
# Example: Visualize top-selling products and categories
top_products = Superstore_sales_data.groupby('Sub-Category')['Quantity'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(12, 6))
top_products.plot(kind='bar')
plt.title('Top 10 Selling Products')
plt.xlabel('Product')
plt.ylabel('Quantity Sold')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

top_categories = Superstore_sales_data.groupby('Category')['Quantity'].sum().sort_values(ascending=False)
plt.figure(figsize=(8, 6))
top_categories.plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Sales by Category')
plt.ylabel('')
plt.show()
In [43]:
# Analyze Sales Trends
# Example: Visualize monthly sales trend
monthly_sales = Superstore_sales_data.groupby(pd.Grouper(key='Order Date', freq='M'))['Sales'].sum()
plt.figure(figsize=(12, 6))
monthly_sales.plot(marker='o')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
In [44]:
# Create Dashboards
!pip install plotly
Requirement already satisfied: plotly in c:\users\dell\anaconda3\lib\site-packages (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\dell\anaconda3\lib\site-packages (from plotly) (8.2.2)
In [45]:
import plotly.express as px
In [46]:
# Create Dashboard Components
# Example: Create a bar chart for top-selling products
top_products = Superstore_sales_data.groupby('Sub-Category')['Quantity'].sum().sort_values(ascending=False).head(10).reset_index()
bar_chart = px.bar(top_products, x='Sub-Category', y='Quantity', title='Top 10 Selling Products')
In [47]:
# Example: Create a pie chart for sales distribution by category
category_sales = Superstore_sales_data.groupby('Category')['Sales'].sum().reset_index()
pie_chart = px.pie(category_sales, values='Sales', names='Category', title='Sales Distribution by Category')
In [48]:
# Example: Create a line chart for total sales over time
sales_over_time = Superstore_sales_data.groupby('Order Date')['Sales'].sum().reset_index()
line_chart = px.line(sales_over_time, x='Order Date', y='Sales', title='Total Sales Over Time')
In [49]:
# Example: Create a scatter plot for sales vs. profit
scatter_plot = px.scatter(Superstore_sales_data, x='Sales', y='Profit', title='Sales vs. Profit')
In [50]:
# Example: Create a histogram for sales distribution
histogram = px.histogram(Superstore_sales_data, x='Sales', title='Sales Distribution')
In [51]:
# Example: Create a heatmap for correlation matrix
correlation_matrix = Superstore_sales_data.corr()
heatmap = px.imshow(correlation_matrix, color_continuous_scale='Viridis', title='Correlation Matrix')
C:\Users\Dell\AppData\Local\Temp\ipykernel_6912\2261162957.py:2: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

In [52]:
# Example: Create a box plot for sales distribution by region
box_plot = px.box(Superstore_sales_data, x='Region', y='Sales', title='Sales Distribution by Region')
In [53]:
# Combine Dashboard Components
# Combine charts into a dashboard layout
dashboard_layout = [bar_chart, pie_chart, line_chart, scatter_plot, histogram, heatmap, box_plot]
In [54]:
# Display Dashboard
# Display the dashboard
for chart in dashboard_layout:
    chart.show()
In [ ]: